Dataset 4 Integration

Joey Hastings

2023-04-04

Load Packages

library(tictoc)
library(tidyverse)
library(cowplot)
library(ggExtra)
library(ggthemes)
library(ggrepel)
library(splatter)
library(scater)
library(scran)
library(kableExtra)
library(grid)
library(gridExtra)
library(r.jive)
library(Seurat)
library(harmony)

library(RSpectra)
library(scRNAseq)
library(lme4)

library(kBET)    # kBET
library(cluster) # ASW
library(lisi)    # LISI

source("PVCA.R")
source("JIVE/jive_speedup.R", chdir = T)

theme_set(theme_few())

dataset_name <- "dataset4"

Zilionis Mouse Lung Data

We will test the integration methods on the following batches and cell types:

  • Batches: round1_20151128, round2_20151217
  • Cell Types: B cells, T cells
# scRNAseq package
ZilionisLungData <- ZilionisLungData("mouse")
rownames(ZilionisLungData) <- paste0("Gene", 1:nrow(ZilionisLungData)) # for simplicity
colData(ZilionisLungData)$Batch <- factor(colData(ZilionisLungData)$`Library prep batch`)
colData(ZilionisLungData)$Cluster <- factor(colData(ZilionisLungData)$`Major cell type`)

ZilionisLungData <-
  ZilionisLungData[, colData(ZilionisLungData)$Cluster %in% c("B cells", "T cells")]
colData(ZilionisLungData)$Cluster <- factor(colData(ZilionisLungData)$Cluster)

Cell Frequencies by Batch and Cell Cluster

batch_freq <- as.data.frame(table(Batch = ZilionisLungData$Batch))
batch_cluster_freq <-
  as.data.frame(table(
    Batch = ZilionisLungData$Batch,
    Cluster = ZilionisLungData$Cluster
  )) %>%
  pivot_wider(id_cols = c("Batch"), names_from = "Cluster", values_from = "Freq")

batch_cluster_df <- left_join(batch_freq, batch_cluster_freq, by = "Batch")
batch_cluster_df %>%
  filter(Batch %in% c("round1_20151128", "round2_20151217")) %>%
  kbl() %>%
  kable_styling() %>%
  add_header_above(c("Batch" = 2, "Cluster" = ncol(batch_cluster_df) - 2)) %>%
  column_spec(2, border_right = T)
Batch
Cluster
Batch Freq B cells T cells
round1_20151128 1220 641 579
round2_20151217 1313 879 434

Preprocess Data

# Create Seurat object list so all data is preprocessed in the same manner
zilionis_metadata <- as.data.frame(colData(ZilionisLungData))
zilionis_seurat <- CreateSeuratObject(counts(ZilionisLungData), meta.data = zilionis_metadata)

#################
# Identify HVGs #
#################

# Use Seurat feature selection method for selecting highly variable genes

# split the dataset into a list of seurat objects (one for each batch)
zilionis_seurat_batch <- SplitObject(zilionis_seurat, split.by = "Batch")

# normalize and identify variable features for each dataset independently
zilionis_seurat_batch <- lapply(X = zilionis_seurat_batch, FUN = function(x) {
  x <- NormalizeData(x)
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

# select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = zilionis_seurat_batch)

#######################################
# Create SCE object for visualization #
#######################################

# Select only HVGs
ZilionisLungData <- ZilionisLungData[features, ]
# Subset to batches of interest
ZilionisLungData <- ZilionisLungData[, colData(ZilionisLungData)$Batch %in% c("round1_20151128", "round2_20151217")]

# Final SCE object
data_SCE <- ZilionisLungData
colData(data_SCE)$Batch <- factor(colData(data_SCE)$Batch)
colData(data_SCE)$Cluster <- factor(colData(data_SCE)$Cluster)

Visualize Raw Counts

data_SCE <- logNormCounts(data_SCE)

# Dimension reduction visualization
set.seed(1)
data_SCE <- runPCA(data_SCE, ncomponents = 30, ntop = 2000)
data_SCE <- runTSNE(data_SCE)
data_SCE <- runUMAP(data_SCE)

orig_title <- labs(title = "Original Data")
brew_colors <- RColorBrewer::brewer.pal(9, "Set1")
p1_options <- scale_color_manual(name = "Batch", values = brew_colors[1:2])
p2_options <- scale_color_manual(name = "Cluster", values = brew_colors[c(3, 4)])

orig_p1 <- plot_grid(
  plotPCA(data_SCE, color_by = "Batch")   + orig_title + p1_options,
  plotPCA(data_SCE, color_by = "Cluster") + p2_options,
  nrow=2, align = "v"
  )

orig_p2 <- plot_grid(
  plotTSNE(data_SCE, color_by = "Batch")   + orig_title + p1_options,
  plotTSNE(data_SCE, color_by = "Cluster") + p2_options,
  nrow=2, align = "v"
  )

orig_p3 <- plot_grid(
  plotUMAP(data_SCE, color_by = "Batch")   + orig_title + p1_options,
  plotUMAP(data_SCE, color_by = "Cluster") + p2_options,
  nrow=2, align = "v"
  )

orig_p1 # ggsave("orig_pca.png", orig_p1)

orig_p2 # ggsave("orig_tsne.png", orig_p2)

orig_p3 # ggsave("orig_umap.png", orig_p3)

PVCA

batches <- data_SCE$Batch
clusters <- data_SCE$Cluster
rawcounts <- as.matrix(logcounts(data_SCE))

# Perform PVCA for 10 random samples of size 1000 (more computationally efficient)
pvca_res <- matrix(nrow = 10, ncol = 3)
meta <- cbind(Batch = batches, Cluster = clusters)

set.seed(1)
for (i in 1:10){
  sample <- sample(1:ncol(rawcounts), 1000, replace = FALSE)
  pvca_res[i, ] <- PVCA(rawcounts[, sample], meta[sample, ], threshold = 0.6, inter = FALSE)
}

# Average effect size across samples
pvca_means <- colMeans(pvca_res)
names(pvca_means) <- c(colnames(meta), "Residual")

# Plot PVCA
pvca_plot <- PlotPVCA(pvca_means, "PVCA of Raw Count Data")
pvca_plot

ggsave("output/pvca_zilionisdata.png", pvca_plot, width = 9, height = 4)

Perform Integration Methods

JIVE

method_name <- "JIVE"

Run Integration Method

n_batches <- length(unique(batches))
unq_batches <- sort(unique(batches))

jive_data <- NULL
all_clusters <- NULL

for (i in 1:n_batches) {
  jive_data[[as.character(unq_batches[i])]] <- t(rawcounts[, batches == unq_batches[i]])
  all_clusters[[as.character(unq_batches[i])]] <- clusters[batches == unq_batches[i]]
}

CORES <- parallel::detectCores()

tic("JIVE v2 runtime (given)")
JIVE_results <-
  jive_v2(
    jive_data,
    rankJ = 10,
    rankA = c(130, 145),
    method = "given",
    maxiter = 10000,
    CORES = CORES
  )
# JIVE v2 runtime (given): 325.61 sec elapsed
# JIVE algorithm converged after  174  iterations.
toc()

saveRDS(JIVE_results, file = "JIVE/data/JIVE_v2_dataset4.rds")

Summary of estimated ranks and variance explained by joint/individual/residual for each approach:

JIVE_results <- readRDS(file = "JIVE/data/JIVE_v2_dataset4.rds")
summary(JIVE_results)
## $Method
## [1] "given"
## 
## $Ranks
##      Source            Rank 
## [1,] "Joint"           "10" 
## [2,] "round1_20151128" "130"
## [3,] "round2_20151217" "145"
## 
## $Variance
##            round1_20151128 round2_20151217
## Joint                0.332           0.341
## Individual           0.328           0.205
## Residual             0.340           0.453
plot(JIVE_results)

joint <- do.call(rbind, JIVE_results$joint)
jive_final <- joint

Visualization of Integration

joint_svd <- svds(joint, k = JIVE_results$rankJ)

var_explained <- data.frame(
  sv = joint_svd$d,
  cum_sv = cumsum(joint_svd$d),
  tot_sv = sum(joint_svd$d)
  ) %>%
  mutate(
    pct_sv = sv / tot_sv,
    cumpct_sv = cum_sv / tot_sv,
    PC = paste0("PC", row_number()),
    PC = factor(PC, levels = paste0("PC", row_number()))
  )

var_explained %>%
  ggplot(aes(x = PC)) +
  geom_col(aes(y = cumpct_sv, fill = PC), color = "black") +
  geom_col(aes(y = cumpct_sv - pct_sv), fill = "gray", color = "black") +
  geom_label(aes(y = cumpct_sv - (pct_sv / 2), label = paste0(round(pct_sv * 100, 2), "%"))) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "", y = "",
    title = "Cumulative Percentage of Total Variance Explained",
    subtitle = "Integrated (Joint) Data"
  )

jive_integrated <- SingleCellExperiment(list(logcounts = t(jive_final)))
colData(jive_integrated)$Batch <- batches
colData(jive_integrated)$Cluster <- clusters

set.seed(1)
# Original PCA based on joint matrices for each batch
jive_integrated <- runPCA(jive_integrated, ncomponents = 30, ntop = 2000)

# t-SNE based on original PC
jive_integrated <- runTSNE(jive_integrated)

# UMAP based on original PC
jive_integrated <- runUMAP(jive_integrated)

plot_title <- labs(
    title = "Integrated Data",
    subtitle = paste0(
      "Joint Rank: ",
      JIVE_results$rankJ,
      ", Individual Ranks: ",
      paste(JIVE_results$rankA, collapse = ", ")
    )
  )

# PCA
jive_pca <- plot_grid(
  plotPCA(jive_integrated, color_by = "Batch")   + plot_title + p1_options,
  plotPCA(jive_integrated, color_by = "Cluster") + p2_options,
  nrow = 2, align = "v"
  )

# jive_pca_2_3 <- plot_grid(
#   plotPCA(jive_integrated, color_by = "Batch",   ncomponents = 2:3) + plot_title + p1_options,
#   plotPCA(jive_integrated, color_by = "Cluster", ncomponents = 2:3) + p2_options,
#   nrow = 2, align = "v"
#   )
# 
# jive_pca_3_4 <- plot_grid(
#   plotPCA(jive_integrated, color_by = "Batch",   ncomponents = 3:4) + plot_title + p1_options,
#   plotPCA(jive_integrated, color_by = "Cluster", ncomponents = 3:4) + p2_options,
#   nrow = 2, align = "v"
#   )
# 
# jive_pca_4_5 <- plot_grid(
#   plotPCA(jive_integrated, color_by = "Batch",   ncomponents = 4:5) + plot_title + p1_options,
#   plotPCA(jive_integrated, color_by = "Cluster", ncomponents = 4:5) + p2_options,
#   nrow = 2, align = "v"
#   )

# t-SNE
jive_tsne <- plot_grid(
  plotTSNE(jive_integrated, color_by = "Batch")   + p1_options,
  plotTSNE(jive_integrated, color_by = "Cluster") + p2_options,
  nrow = 2, align = "v"
  )

# UMAP
jive_umap <- plot_grid(
  plotUMAP(jive_integrated, color_by = "Batch")   + p1_options,
  plotUMAP(jive_integrated, color_by = "Cluster") + p2_options,
  nrow = 2, align = "v"
  )

jive_pca

jive_tsne

jive_umap

kBET

  • Sub-sample data at varying percentages
    • 5%, 10%, 15%, 20%, 25%
    • Stratified sampling so that each batch has the same number of cells selected
      • E.g., assume 2 batches, sampling 10% of 1000 cells = 100 cells: sample 50 cells from each batch
  • Calculate kBET rejection rates at each percentage
    • Lower RR indicate well-mixed batches
    • Higher RR indicate poorly-mixed batches
# data: a matrix (rows: samples, columns: features (genes))
data <- svds(jive_final, k = 30)$u

# batch: vector or factor with batch label of each cell 
batch <- batches

sample_size <- seq(0.05, 0.25, by = 0.05)
rejection_rate <- list()

set.seed(1)
for (i in 1:length(sample_size)) {
  k <- floor(nrow(data) * sample_size[i])
  
  cat("\nSample Size: ", sample_size[i], ", k = ", k, "\n", sep = "")
  cat("------------------------------\n")
 
  batch.estimate <- kBET(data, batch, k0 = k, plot = F, do.pca = F, verbose = T)
  
  cat("\n------------------------------\n")
  
  if (class(batch.estimate) == "list") {
    rejection_rate[[i]] <- batch.estimate$summary %>%
      rownames_to_column(var = "statistic") %>%
      mutate(sample_size = sample_size[i]) %>%
      select(sample_size, statistic, kBET_rr = kBET.observed)
  }
}
## 
## Sample Size: 0.05, k = 126
## ------------------------------
## finding knns...done. Time:
##    user  system elapsed 
##    0.39    0.00    0.40 
## Number of kBET tests is set to 254.
## 
## ------------------------------
## 
## Sample Size: 0.1, k = 253
## ------------------------------
## finding knns...done. Time:
##    user  system elapsed 
##    0.55    0.00    0.55 
## Number of kBET tests is set to 254.
## 
## ------------------------------
## 
## Sample Size: 0.15, k = 379
## ------------------------------
## finding knns...done. Time:
##    user  system elapsed 
##    0.75    0.02    0.77 
## Number of kBET tests is set to 254.
## 
## ------------------------------
## 
## Sample Size: 0.2, k = 506
## ------------------------------
## finding knns...done. Time:
##    user  system elapsed 
##    0.99    0.00    0.99 
## Number of kBET tests is set to 254.
## 
## ------------------------------
## 
## Sample Size: 0.25, k = 633
## ------------------------------
## finding knns...done. Time:
##    user  system elapsed 
##    1.22    0.02    1.25 
## Number of kBET tests is set to 254.
## 
## ------------------------------
jive_kbet <- bind_rows(rejection_rate) %>%
  mutate(Method = method_name) %>%
  pivot_wider(
    id_cols = c("Method", "sample_size"),
    names_from = "statistic",
    values_from = "kBET_rr"
  ) %>%
  select(Method, sample_size, median_rr = `50%`)

jive_kbet %>% write_csv(file = paste0("output/", method_name, "_", dataset_name, "_kBET.csv"))

Average Silhouette Width

Original batch label and cell types/groups will be considered the clusters to compute \(ASW_{batch}\) and \(ASW_{group}\)

  • Compute average silhouette width (ASW) for the first 30 PCs of 80% sub-sample of joint data
    • ASW ranges between -1 and 1
    • Values close to 1 indicate the cell is well-matched to its cluster (i.e., batch label)
    • Values close to -1 indicate the cell is not well-matched to its cluster (i.e., batch label)
      • We hope to see lower values for \(ASW_{batch}\) since it is indicative of well-mixed batches
      • We hope to see higher values for \(ASW_{group}\) since it is indicative that distinct cell groups were preserved after batch-mixing
  • Repeat 20 times
  • Calculate the median ASW from the 20 runs to ensure stability of the measurement
asw_repeat <- 20
npcs <- 30

asw_batch <- list()
asw_cluster <- list()
subset_prop <- 0.8 # subsample to 80% of the data
subset_size_total <- floor(length(batch) * subset_prop)

for (i in 1:asw_repeat) {
  set.seed(i)
  subset_index <- 
    data.frame(index = 1:length(batch) , batch = batch) %>%
    slice_sample(n = subset_size_total, replace = F)
  
  subset_id <- subset_index %>%
    pull(index)
  
  data_pc <- svds(jive_final[subset_id, ], k = npcs)$u
  
  dissimilarity_matrix <- daisy(data_pc)
  
  asw_batch_i <- silhouette(as.integer(factor(batches[subset_id])), dissimilarity_matrix)
  asw_batch[[i]] <- as.data.frame(rbind(summary(asw_batch_i)[["si.summary"]]))
  
  asw_cluster_i <- silhouette(as.integer(factor(clusters[subset_id])), dissimilarity_matrix)
  asw_cluster[[i]] <- as.data.frame(rbind(summary(asw_cluster_i)[["si.summary"]]))
}

jive_asw <- bind_rows(
  bind_rows(asw_batch) %>% mutate(Label = "Batch"),
  bind_rows(asw_cluster) %>% mutate(Label = "Cluster")
  ) %>%
  mutate(Method = method_name) %>%
  select(Method, Label, Mean)

jive_asw %>% write_csv(file = paste0("output/", method_name, "_", dataset_name, "_ASW.csv"))

LISI Scores

# Performed for all cells
jive_lisi <-
  compute_lisi(
    jive_final,
    data.frame(Batch = batches, Cluster = clusters),
    c("Batch", "Cluster"),
    perplexity = 40
  ) %>%
  pivot_longer(
    cols = c("Batch", "Cluster"),
    names_to = "Label",
    values_to = "LISI"
  ) %>%
  mutate(Method = method_name)

jive_lisi %>%
  write_csv(file = paste0("output/", method_name, "_", dataset_name, "_LISI.csv"))

Seurat v3

method_name <- "Seurat"

batch_indices <- which(unique(zilionis_seurat$Batch) %in% c("round1_20151128", "round2_20151217")) # subset batches
zilionis_seurat_batch <- zilionis_seurat_batch[batch_indices]

Run Integration Method

  • Perform integration by finding a set of anchors from the two batches (via CCA)
  • Use the anchor list to integrate the two batches
anchors <- FindIntegrationAnchors(object.list = zilionis_seurat_batch, anchor.features = features)
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 4353 anchors
## Filtering anchors
##  Retained 1974 anchors
seurat_integrated <- IntegrateData(anchorset = anchors)
## Merging dataset 1 into 2
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
GetAssay(seurat_integrated, "integrated")
## Assay data with 2000 features for 2533 cells
## Top 10 variable features:
##  Gene23012, Gene3205, Gene16391, Gene19729, Gene19727, Gene23010,
## Gene14054, Gene15841, Gene15375, Gene15115
  • Integrated/combined data is centered/scaled
  • First 30 PCs are computed/stored
  • PC1-PC30 are used to perform t-SNE and UMAP dimensionality reduction
  • Calculates k-nearest neighbors (k = 20 by default) from PC1-PC30
  • Identify clusters by using Shared Nearest Neighbors (SNN):
    • “First calculate k-nearest neighbors and construct the SNN graph”
    • “Then optimize the modularity function to determine clusters”
# specify that we will perform downstream analysis on the corrected data note that the
# original unmodified data still resides in the 'RNA' assay
DefaultAssay(seurat_integrated) <- "integrated"

# Run the standard workflow for visualization and clustering
set.seed(1)
seurat_integrated <- ScaleData(seurat_integrated, verbose = FALSE)
seurat_integrated <- RunPCA(seurat_integrated, verbose = FALSE)
seurat_integrated <- RunTSNE(seurat_integrated, reduction = "pca", dims = 1:30)
seurat_integrated <- RunUMAP(seurat_integrated, reduction = "pca", dims = 1:30)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 18:31:55 UMAP embedding parameters a = 0.9922 b = 1.112
## 18:31:55 Read 2533 rows and found 30 numeric columns
## 18:31:55 Using Annoy for neighbor search, n_neighbors = 30
## 18:31:55 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 18:31:55 Writing NN index file to temp file C:\Users\Joseph\AppData\Local\Temp\RtmpuAJOIW\file3e4c184376e7
## 18:31:55 Searching Annoy index using 1 thread, search_k = 3000
## 18:31:55 Annoy recall = 100%
## 18:31:56 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 18:31:57 Initializing from normalized Laplacian + noise (using RSpectra)
## 18:31:57 Commencing optimization for 500 epochs, with 99716 positive edges
## 18:32:03 Optimization finished
seurat_integrated <- FindNeighbors(seurat_integrated, reduction = "pca", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
seurat_integrated <- FindClusters(seurat_integrated, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2533
## Number of edges: 134458
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8059
## Number of communities: 10
## Elapsed time: 0 seconds
seurat_integrated <- AddMetaData(seurat_integrated, factor(seurat_integrated$Batch), col.name = "Batch")
seurat_integrated <- AddMetaData(seurat_integrated, factor(seurat_integrated$Cluster), col.name = "Cluster")

seurat_final <- t(seurat_integrated@assays$integrated@scale.data)

Visualization of Integration

plot_title <- labs(
    title = "Integrated Data",
    subtitle = method_name
  )

# PCA
seurat_pca <- plot_grid(
  PCAPlot(seurat_integrated, group.by = "Batch")   + plot_title  + p1_options,
  PCAPlot(seurat_integrated, group.by = "Cluster") + ggtitle("") + p2_options,
  nrow = 2, align = "vh"
  )

# t-SNE
seurat_tsne <- plot_grid(
  TSNEPlot(seurat_integrated, group.by = "Batch")   + plot_title  + p1_options,
  TSNEPlot(seurat_integrated, group.by = "Cluster") + ggtitle("") + p2_options,
  nrow = 2, align = "vh"
  )

# UMAP
seurat_umap <- plot_grid(
  UMAPPlot(seurat_integrated, group.by = "Batch")   + plot_title  + p1_options,
  UMAPPlot(seurat_integrated, group.by = "Cluster") + ggtitle("") + p2_options,
  nrow = 2, align = "vh"
  )

seurat_pca

seurat_tsne

seurat_umap

kBET

# data: a matrix (rows: samples, columns: features (genes))
data <- Embeddings(seurat_integrated)

# batch: vector or factor with batch label of each cell 
batch <- batches

sample_size <- seq(0.05, 0.25, by = 0.05)
rejection_rate <- list()

set.seed(1)
for (i in 1:length(sample_size)) {
  k <- floor(nrow(data) * sample_size[i])
  
  cat("\nSample Size: ", sample_size[i], ", k = ", k, "\n", sep = "")
  cat("------------------------------\n")
 
  batch.estimate <- kBET(data, batch, k0 = k, plot = F, do.pca = F, verbose = T)
  
  cat("\n------------------------------\n")
  
  if (class(batch.estimate) == "list") {
    rejection_rate[[i]] <- batch.estimate$summary %>%
      rownames_to_column(var = "statistic") %>%
      mutate(sample_size = sample_size[i]) %>%
      select(sample_size, statistic, kBET_rr = kBET.observed)
  }
}
## 
## Sample Size: 0.05, k = 126
## ------------------------------
## finding knns...done. Time:
##    user  system elapsed 
##    0.62    0.00    0.62 
## Number of kBET tests is set to 254.
## There are 44 cells (1.737%) that do not appear in any neighbourhood.
## The expected frequencies for each category have been adapted.
## Cell indexes are saved to result list.
## 
## ------------------------------
## 
## Sample Size: 0.1, k = 253
## ------------------------------
## finding knns...done. Time:
##    user  system elapsed 
##    0.85    0.00    0.84 
## Number of kBET tests is set to 254.
## There are 24 cells (0.947%) that do not appear in any neighbourhood.
## The expected frequencies for each category have been adapted.
## Cell indexes are saved to result list.
## 
## ------------------------------
## 
## Sample Size: 0.15, k = 379
## ------------------------------
## finding knns...done. Time:
##    user  system elapsed 
##    1.12    0.00    1.12 
## Number of kBET tests is set to 254.
## There are 17 cells (0.671%) that do not appear in any neighbourhood.
## The expected frequencies for each category have been adapted.
## Cell indexes are saved to result list.
## 
## ------------------------------
## 
## Sample Size: 0.2, k = 506
## ------------------------------
## finding knns...done. Time:
##    user  system elapsed 
##    1.42    0.01    1.44 
## Number of kBET tests is set to 254.
## There are 14 cells (0.553%) that do not appear in any neighbourhood.
## The expected frequencies for each category have been adapted.
## Cell indexes are saved to result list.
## 
## ------------------------------
## 
## Sample Size: 0.25, k = 633
## ------------------------------
## finding knns...done. Time:
##    user  system elapsed 
##    1.77    0.00    1.77 
## Number of kBET tests is set to 254.
## No outsiders found.
## ------------------------------
seurat_kbet <- bind_rows(rejection_rate) %>%
  mutate(Method = method_name) %>%
  pivot_wider(
    id_cols = c("Method", "sample_size"),
    names_from = "statistic",
    values_from = "kBET_rr"
  ) %>%
  select(Method, sample_size, median_rr = `50%`)

seurat_kbet %>% write_csv(file = paste0("output/", method_name, "_", dataset_name, "_kBET.csv"))

Average Silhouette Width

asw_repeat <- 20
npcs <- 30

asw_batch <- list()
asw_cluster <- list()
subset_prop <- 0.8 # subsample to 80% of the data
subset_size_total <- floor(length(batch) * subset_prop)

for (i in 1:asw_repeat) {
  set.seed(i)
  subset_index <- 
    data.frame(index = 1:length(batch) , batch = batch) %>%
    slice_sample(n = subset_size_total, replace = F)
  
  subset_id <- subset_index %>%
    pull(index)
  
  seurat_tmp <- NULL
  seurat_tmp <-
    RunPCA(
      seurat_integrated[, subset_id],
      npcs = npcs,
      verbose = FALSE,
      reduction.name = "subset_pca",
      reduction.key = "sPC_",
      seed.use = NULL
    )
  data_pc <- seurat_tmp@reductions$subset_pca@cell.embeddings
  
  dissimilarity_matrix <- daisy(data_pc)
  
  asw_batch_i <- silhouette(as.integer(factor(batches[subset_id])), dissimilarity_matrix)
  asw_batch[[i]] <- as.data.frame(rbind(summary(asw_batch_i)[["si.summary"]]))
  
  asw_cluster_i <- silhouette(as.integer(factor(clusters[subset_id])), dissimilarity_matrix)
  asw_cluster[[i]] <- as.data.frame(rbind(summary(asw_cluster_i)[["si.summary"]]))
}

seurat_asw <- bind_rows(
  bind_rows(asw_batch) %>% mutate(Label = "Batch"),
  bind_rows(asw_cluster) %>% mutate(Label = "Cluster")
  ) %>%
  mutate(Method = method_name) %>%
  select(Method, Label, Mean)

seurat_asw %>% write_csv(file = paste0("output/", method_name, "_", dataset_name, "_ASW.csv"))

LISI Scores

# Performed for all cells
seurat_lisi <-
  compute_lisi(
    seurat_final,
    data.frame(Batch = batches, Cluster = clusters),
    c("Batch", "Cluster"),
    perplexity = 40
  ) %>%
  pivot_longer(
    cols = c("Batch", "Cluster"),
    names_to = "Label",
    values_to = "LISI"
  ) %>%
  mutate(Method = method_name)

seurat_lisi %>%
  write_csv(file = paste0("output/", method_name, "_", dataset_name, "_LISI.csv"))

Harmony

method_name <- "Harmony"

Run Integration Method

harmony_integrated <-
  runPCA(
    data_SCE,
    ncomponents = 30,
    ntop = nrow(data_SCE)
  )

harmony_integrated <-
  RunHarmony(
    harmony_integrated,
    "Batch",
    max.iter.harmony = 20
  )
## Harmony 1/20
## Harmony 2/20
## Harmony 3/20
## Harmony 4/20
## Harmony 5/20
## Harmony 6/20
## Harmony 7/20
## Harmony 8/20
## Harmony converged after 8 iterations
harmony_integrated <- runTSNE(harmony_integrated, dimred = "HARMONY")
harmony_integrated <- runUMAP(harmony_integrated, dimred = "HARMONY")

harmony_final <- reducedDim(harmony_integrated, "HARMONY")

Visualization of Integration

plot_title <- labs(title = "Integrated Data", subtitle = method_name)

# Harmony PCA
harmony_pca <- plot_grid(
  plotReducedDim(harmony_integrated, dimred = "HARMONY",
    color_by = "Batch",   ncomponents = 1:2 ) + plot_title + p1_options,
  plotReducedDim(harmony_integrated, dimred = "HARMONY",
    color_by = "Cluster", ncomponents = 1:2 ) + p2_options,
  nrow = 2, align = "vh"
  )
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
# t-SNE
harmony_tsne <- plot_grid(
  plotTSNE(harmony_integrated, color_by = "Batch")   + plot_title + p1_options,
  plotTSNE(harmony_integrated, color_by = "Cluster") + p2_options,
  nrow = 2, align = "vh"
  )
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
# UMAP
harmony_umap <- plot_grid(
  plotUMAP(harmony_integrated, color_by = "Batch")   + plot_title + p1_options,
  plotUMAP(harmony_integrated, color_by = "Cluster") + p2_options,
  nrow = 2, align = "vh"
  )
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
harmony_pca

harmony_tsne

harmony_umap

kBET

# data: a matrix (rows: samples, columns: features (genes))
data <- harmony_final[, 1:30]

# batch: vector or factor with batch label of each cell 
batch <- batches

sample_size <- seq(0.05, 0.25, by = 0.05)
rejection_rate <- list()

set.seed(1)
for (i in 1:length(sample_size)) {
  k <- floor(nrow(data) * sample_size[i])
  
  cat("\nSample Size: ", sample_size[i], ", k = ", k, "\n", sep = "")
  cat("------------------------------\n")
 
  batch.estimate <- kBET(data, batch, k0 = k, plot = F, do.pca = F, verbose = T)
  
  cat("\n------------------------------\n")
  
  if (class(batch.estimate) == "list") {
    rejection_rate[[i]] <- batch.estimate$summary %>%
      rownames_to_column(var = "statistic") %>%
      mutate(sample_size = sample_size[i]) %>%
      select(sample_size, statistic, kBET_rr = kBET.observed)
  }
}
## 
## Sample Size: 0.05, k = 126
## ------------------------------
## finding knns...done. Time:
##    user  system elapsed 
##    0.45    0.00    0.46 
## Number of kBET tests is set to 254.
## 
## ------------------------------
## 
## Sample Size: 0.1, k = 253
## ------------------------------
## finding knns...done. Time:
##    user  system elapsed 
##    0.63    0.00    0.63 
## Number of kBET tests is set to 254.
## 
## ------------------------------
## 
## Sample Size: 0.15, k = 379
## ------------------------------
## finding knns...done. Time:
##    user  system elapsed 
##    0.81    0.00    0.81 
## Number of kBET tests is set to 254.
## 
## ------------------------------
## 
## Sample Size: 0.2, k = 506
## ------------------------------
## finding knns...done. Time:
##    user  system elapsed 
##    1.04    0.00    1.03 
## Number of kBET tests is set to 254.
## 
## ------------------------------
## 
## Sample Size: 0.25, k = 633
## ------------------------------
## finding knns...done. Time:
##    user  system elapsed 
##    1.26    0.02    1.29 
## Number of kBET tests is set to 254.
## 
## ------------------------------
harmony_kbet <- bind_rows(rejection_rate) %>%
  mutate(Method = method_name) %>%
  pivot_wider(
    id_cols = c("Method", "sample_size"),
    names_from = "statistic",
    values_from = "kBET_rr"
  ) %>%
  select(Method, sample_size, median_rr = `50%`)

harmony_kbet %>% write_csv(file = paste0("output/", method_name, "_", dataset_name, "_kBET.csv"))

Average Silhouette Width

asw_repeat <- 20
npcs <- 30

asw_batch <- list()
asw_cluster <- list()
subset_prop <- 0.8 # subsample to 80% of the data
subset_size_total <- floor(length(batch) * subset_prop)

for (i in 1:asw_repeat) {
  set.seed(i)
  subset_index <- 
    data.frame(index = 1:length(batch) , batch = batch) %>%
    slice_sample(n = subset_size_total, replace = F)
  
  subset_id <- subset_index %>%
    pull(index)
  
  data_pc <- svds(harmony_final[subset_id, ], k = npcs)$u
  
  dissimilarity_matrix <- daisy(data_pc)
  
  asw_batch_i <- silhouette(as.integer(factor(batches[subset_id])), dissimilarity_matrix)
  asw_batch[[i]] <- as.data.frame(rbind(summary(asw_batch_i)[["si.summary"]]))
  
  asw_cluster_i <- silhouette(as.integer(factor(clusters[subset_id])), dissimilarity_matrix)
  asw_cluster[[i]] <- as.data.frame(rbind(summary(asw_cluster_i)[["si.summary"]]))
}
## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead

## Warning in fun(A, k, nu, nv, opts, mattype = "matrix"): all singular values are
## requested, svd() is used instead
harmony_asw <- bind_rows(
  bind_rows(asw_batch) %>% mutate(Label = "Batch"),
  bind_rows(asw_cluster) %>% mutate(Label = "Cluster")
  ) %>%
  mutate(Method = method_name) %>%
  select(Method, Label, Mean)

harmony_asw %>% write_csv(file = paste0("output/", method_name, "_", dataset_name, "_ASW.csv"))

LISI Scores

# Performed for all cells
harmony_lisi <-
  compute_lisi(
    harmony_final,
    data.frame(Batch = batches, Cluster = clusters),
    c("Batch", "Cluster"),
    perplexity = 40
  ) %>%
  pivot_longer(
    cols = c("Batch", "Cluster"),
    names_to = "Label",
    values_to = "LISI"
  ) %>%
  mutate(Method = method_name)

harmony_lisi %>%
  write_csv(file = paste0("output/", method_name, "_", dataset_name, "_LISI.csv"))

Integration Method Comparison

method_lvl <- c("JIVE", "Seurat", "Harmony", "Raw")

t-SNE Plots

# JIVE
jive_tsne_df <- reducedDim(jive_integrated, "TSNE") %>%
  as.data.frame() %>%
  mutate(Method = "JIVE") %>%
  mutate(Batch = batches, Cluster = clusters)

jive_tsne_batch <- jive_tsne_df %>%
  ggplot(aes(x = V1, y = V2, color = Batch)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"), legend.position = "right") +
  labs(title = "JIVE") + p1_options

jive_tsne_cluster <- jive_tsne_df %>%
  ggplot(aes(x = V1, y = V2, color = Cluster)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  theme(legend.position = "right") + p2_options

jive_tsne_plot <- plot_grid(
  jive_tsne_batch + theme(legend.position = "none"),
  jive_tsne_cluster + theme(legend.position = "none"),
  nrow = 2, align = "hv"
) + theme(plot.background = element_rect(color = "black"))

# Seurat
seurat_tsne_df <- Embeddings(seurat_integrated, "tsne") %>%
  as.data.frame() %>%
  mutate(Method = "Seurat") %>%
  mutate(Batch = batches, Cluster = clusters)

seurat_tsne_batch <- seurat_tsne_df %>%
  ggplot(aes(x = tSNE_1, y = tSNE_2, color = Batch)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Seurat") + p1_options

seurat_tsne_cluster <- seurat_tsne_df %>%
  ggplot(aes(x = tSNE_1, y = tSNE_2, color = Cluster)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  p2_options

seurat_tsne_plot <- plot_grid(
  seurat_tsne_batch,
  seurat_tsne_cluster,
  nrow = 2, align = "hv"
) + theme(plot.background = element_rect(color = "black"))

# Harmony
harmony_tsne_df <- reducedDim(harmony_integrated, "TSNE") %>%
  as.data.frame() %>%
  mutate(Method = "Harmony") %>%
  mutate(Batch = batches, Cluster = clusters)

harmony_tsne_batch <- harmony_tsne_df %>%
  ggplot(aes(x = V1, y = V2, color = Batch)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Harmony") + p1_options

harmony_tsne_cluster <- harmony_tsne_df %>%
  ggplot(aes(x = V1, y = V2, color = Cluster)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  p2_options

harmony_tsne_plot <- plot_grid(
  harmony_tsne_batch,
  harmony_tsne_cluster,
  nrow = 2, align = "hv"
) + theme(plot.background = element_rect(color = "black"))

# Raw
raw_tsne_df <- reducedDim(data_SCE, "TSNE") %>%
  as.data.frame() %>%
  mutate(Method = "Raw") %>%
  mutate(Batch = batches, Cluster = clusters)

raw_tsne_batch <- raw_tsne_df %>%
  ggplot(aes(x = V1, y = V2, color = Batch)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Raw") + p1_options

raw_tsne_cluster <- raw_tsne_df %>%
  ggplot(aes(x = V1, y = V2, color = Cluster)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  p2_options

raw_tsne_plot <- plot_grid(
  raw_tsne_batch,
  raw_tsne_cluster,
  nrow = 2, align = "hv"
) + theme(plot.background = element_rect(color = "black"))

# All
legend_batch <- get_legend(
  # create some space to the left of the legend
  jive_tsne_batch + theme(legend.box.margin = margin(0, 0, 0, 12))
)

legend_cluster <- get_legend(
  # create some space to the left of the legend
  jive_tsne_cluster + theme(legend.box.margin = margin(0, 0, 0, 12))
)

tsne_prow <- plot_grid(
  jive_tsne_plot,
  seurat_tsne_plot,
  harmony_tsne_plot,
  raw_tsne_plot,
  nrow = 1,
  hjust = -1
  )

tsne_prow_grob <- arrangeGrob(
  tsne_prow,
  left = textGrob("t-SNE 2", rot = 90),
  bottom = textGrob("t-SNE 1")
)

tsne_legend_both <- plot_grid(legend_batch, legend_cluster, nrow = 2, align = "v")

all_tsne_plot <- plot_grid(tsne_prow_grob, tsne_legend_both, rel_widths = c(3, 1))

all_tsne_plot

ggsave("output/tsne_zilionisdata.png", all_tsne_plot, width = 9, height = 4)

UMAP Plots

# JIVE
jive_umap_df <- reducedDim(jive_integrated, "UMAP") %>%
  as.data.frame() %>%
  mutate(Method = "JIVE") %>%
  mutate(Batch = batches, Cluster = clusters)

jive_umap_batch <- jive_umap_df %>%
  ggplot(aes(x = V1, y = V2, color = Batch)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"), legend.position = "right") +
  labs(title = "JIVE") + p1_options

jive_umap_cluster <- jive_umap_df %>%
  ggplot(aes(x = V1, y = V2, color = Cluster)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  theme(legend.position = "right") + p2_options

jive_umap_plot <- plot_grid(
  jive_umap_batch + theme(legend.position = "none"),
  jive_umap_cluster + theme(legend.position = "none"),
  nrow = 2, align = "hv"
) + theme(plot.background = element_rect(color = "black"))

# Seurat
seurat_umap_df <- Embeddings(seurat_integrated, "umap") %>%
  as.data.frame() %>%
  mutate(Method = "Seurat") %>%
  mutate(Batch = batches, Cluster = clusters)

seurat_umap_batch <- seurat_umap_df %>%
  ggplot(aes(x = UMAP_1, y = UMAP_2, color = Batch)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Seurat") + p1_options

seurat_umap_cluster <- seurat_umap_df %>%
  ggplot(aes(x = UMAP_1, y = UMAP_2, color = Cluster)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  p2_options

seurat_umap_plot <- plot_grid(
  seurat_umap_batch,
  seurat_umap_cluster,
  nrow = 2, align = "hv"
) + theme(plot.background = element_rect(color = "black"))

# Harmony
harmony_umap_df <- reducedDim(harmony_integrated, "UMAP") %>%
  as.data.frame() %>%
  mutate(Method = "Harmony") %>%
  mutate(Batch = batches, Cluster = clusters)

harmony_umap_batch <- harmony_umap_df %>%
  ggplot(aes(x = V1, y = V2, color = Batch)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Harmony") + p1_options

harmony_umap_cluster <- harmony_umap_df %>%
  ggplot(aes(x = V1, y = V2, color = Cluster)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  p2_options

harmony_umap_plot <- plot_grid(
  harmony_umap_batch,
  harmony_umap_cluster,
  nrow = 2, align = "hv"
) + theme(plot.background = element_rect(color = "black"))

# Raw
raw_umap_df <- reducedDim(data_SCE, "UMAP") %>%
  as.data.frame() %>%
  mutate(Method = "Raw") %>%
  mutate(Batch = batches, Cluster = clusters)

raw_umap_batch <- raw_umap_df %>%
  ggplot(aes(x = V1, y = V2, color = Batch)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "Raw") + p1_options

raw_umap_cluster <- raw_umap_df %>%
  ggplot(aes(x = V1, y = V2, color = Cluster)) +
  geom_point(alpha = 0.5) +
  theme_nothing() +
  p2_options

raw_umap_plot <- plot_grid(
  raw_umap_batch,
  raw_umap_cluster,
  nrow = 2, align = "hv"
) + theme(plot.background = element_rect(color = "black"))

# All
legend_batch <- get_legend(
  # create some space to the left of the legend
  jive_umap_batch + theme(legend.box.margin = margin(0, 0, 0, 12))
)

legend_cluster <- get_legend(
  # create some space to the left of the legend
  jive_umap_cluster + theme(legend.box.margin = margin(0, 0, 0, 12))
)

umap_prow <- plot_grid(
  jive_umap_plot,
  seurat_umap_plot,
  harmony_umap_plot,
  raw_umap_plot,
  nrow = 1,
  hjust = -1
  )

umap_prow_grob <- arrangeGrob(
  umap_prow,
  left = textGrob("UMAP 2", rot = 90),
  bottom = textGrob("UMAP 1")
)

umap_legend_both <- plot_grid(legend_batch, legend_cluster, nrow = 2, align = "v")

all_umap_plot <- plot_grid(umap_prow_grob, umap_legend_both, rel_widths = c(3, 1))

all_umap_plot

ggsave("output/umap_zilionisdata.png", all_umap_plot, width = 9, height = 4)

kBET

all_kbet <- bind_rows(jive_kbet, seurat_kbet, harmony_kbet, raw_kbet) %>%
  mutate(Method = factor(Method, levels = method_lvl))

kbet_plot <- all_kbet %>%
  ggplot(aes(
    x = sample_size,
    y = I(1 - median_rr),
    group = Method,
    color = Method
  )) +  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_x_continuous(labels = scales::percent) +
  expand_limits(y = c(0, 1)) +
  labs(
    x = "% sample size",
    y = "kBET (acceptance rate)",
    color = "Integration Method"
    )

kbet_plot + labs(title = "kBET Acceptance Rates")

Average Silhouette Width (ASW)

all_asw <- bind_rows(jive_asw, seurat_asw, harmony_asw, raw_asw) %>%
  mutate(Method = factor(Method, levels = method_lvl))

asw_plot <- all_asw %>%
  group_by(Label) %>%
  mutate(Mean = scales::rescale(Mean)) %>%
  group_by(Method, Label) %>%
  summarize(median_ASW = median(Mean), .groups = "keep") %>%
  pivot_wider(
    id_cols = "Method",
    names_from = "Label",
    values_from = "median_ASW"
  ) %>% 
  ggplot(aes(x = Cluster, y = I(1 - Batch), color = Method)) +
  geom_point(size = 2) +
  geom_text_repel(aes(label = Method), show.legend = FALSE) +
  labs(x = "ASW cell type", y = "1 - ASW batch")

asw_plot + labs(title = "Average Silhouette Width Scores")

LISI Scores

all_lisi <- bind_rows(jive_lisi, seurat_lisi, harmony_lisi, raw_lisi) %>%
  mutate(Method = factor(Method, levels = method_lvl))

lisi_plot <- all_lisi %>%
  group_by(Label) %>%
  mutate(LISI = scales::rescale(LISI)) %>%
  group_by(Method, Label) %>%
  summarize(median_LISI = median(LISI), .groups = "keep") %>%
  pivot_wider(
    id_cols = "Method",
    names_from = "Label",
    values_from = "median_LISI"
  ) %>%
  ggplot(aes(x = I(1 - Cluster), y = Batch, color = Method)) +
  geom_point(size = 2) +
  geom_text_repel(aes(label = Method), show.legend = FALSE) +
  labs(x = "1 - cLISI cell type", y = "iLISI batch", )

lisi_plot + labs(title = "LISI Scores")

All Metrics

# arrange the three plots in a single row
prow <- plot_grid(
  kbet_plot + theme(legend.position = "none"),
  asw_plot + theme(legend.position = "none"),
  lisi_plot + theme(legend.position = "none"),
  align = "hv",
  labels = c("A", "B", "C"),
  hjust = -1,
  nrow = 1
)

# extract the legend from one of the plots
legend <- get_legend(
  # create some space to the left of the legend
  kbet_plot + guides(color = guide_legend(nrow = 1)) + theme(legend.position = "bottom")
    
)

# add the legend to the row we made earlier. Give it one-third of 
# the width of one plot (via rel_widths).
all_metrics_lot <- plot_grid(prow, legend, ncol = 1, rel_heights = c(1, .1))
all_metrics_lot

ggsave("output/metrics_zilionisdata.png", all_metrics_lot, width = 9, height = 4)

Session Information

sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] lisi_1.0                    cluster_2.1.4              
##  [3] kBET_0.99.6                 lme4_1.1-31                
##  [5] Matrix_1.5-3                scRNAseq_2.12.0            
##  [7] RSpectra_0.16-1             harmony_0.1.1              
##  [9] Rcpp_1.0.9                  SeuratObject_4.1.3         
## [11] Seurat_4.3.0                r.jive_2.4                 
## [13] gridExtra_2.3               kableExtra_1.3.4           
## [15] scran_1.26.2                scater_1.26.1              
## [17] scuttle_1.8.4               splatter_1.22.1            
## [19] SingleCellExperiment_1.20.0 SummarizedExperiment_1.28.0
## [21] Biobase_2.58.0              GenomicRanges_1.50.2       
## [23] GenomeInfoDb_1.34.9         IRanges_2.32.0             
## [25] S4Vectors_0.36.2            BiocGenerics_0.44.0        
## [27] MatrixGenerics_1.10.0       matrixStats_0.63.0         
## [29] ggrepel_0.9.2               ggthemes_4.2.4             
## [31] ggExtra_0.10.0              cowplot_1.1.1              
## [33] lubridate_1.9.2             forcats_1.0.0              
## [35] stringr_1.5.0               dplyr_1.1.0                
## [37] purrr_1.0.1                 readr_2.1.4                
## [39] tidyr_1.3.0                 tibble_3.1.8               
## [41] ggplot2_3.4.1               tidyverse_2.0.0            
## [43] tictoc_1.1                 
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.3                rtracklayer_1.58.0           
##   [3] scattermore_0.8               ragg_1.2.5                   
##   [5] bit64_4.0.5                   knitr_1.42                   
##   [7] irlba_2.3.5.1                 DelayedArray_0.24.0          
##   [9] data.table_1.14.6             AnnotationFilter_1.22.0      
##  [11] KEGGREST_1.38.0               RCurl_1.98-1.10              
##  [13] generics_0.1.3                GenomicFeatures_1.50.4       
##  [15] ScaledMatrix_1.6.0            RSQLite_2.3.0                
##  [17] RANN_2.6.1                    future_1.31.0                
##  [19] bit_4.0.5                     tzdb_0.3.0                   
##  [21] spatstat.data_3.0-0           webshot_0.5.4                
##  [23] xml2_1.3.3                    httpuv_1.6.8                 
##  [25] viridis_0.6.2                 xfun_0.37                    
##  [27] hms_1.1.2                     jquerylib_0.1.4              
##  [29] evaluate_0.20                 promises_1.2.0.1             
##  [31] restfulr_0.0.15               progress_1.2.2               
##  [33] fansi_1.0.4                   caTools_1.18.2               
##  [35] dbplyr_2.3.1                  igraph_1.3.5                 
##  [37] DBI_1.1.3                     htmlwidgets_1.6.1            
##  [39] spatstat.geom_3.0-5           RcppArmadillo_0.11.4.3.1     
##  [41] ellipsis_0.3.2                backports_1.4.1              
##  [43] bookdown_0.32                 biomaRt_2.54.0               
##  [45] deldir_1.0-6                  sparseMatrixStats_1.10.0     
##  [47] vctrs_0.5.2                   ensembldb_2.22.0             
##  [49] ROCR_1.0-11                   abind_1.4-5                  
##  [51] RcppEigen_0.3.3.9.3           cachem_1.0.7                 
##  [53] withr_2.5.0                   progressr_0.13.0             
##  [55] vroom_1.6.1                   checkmate_2.1.0              
##  [57] sctransform_0.3.5             GenomicAlignments_1.34.0     
##  [59] prettyunits_1.1.1             goftest_1.2-3                
##  [61] svglite_2.1.1                 ExperimentHub_2.6.0          
##  [63] lazyeval_0.2.2                crayon_1.5.2                 
##  [65] spatstat.explore_3.0-6        labeling_0.4.2               
##  [67] edgeR_3.40.2                  pkgconfig_2.0.3              
##  [69] ProtGenerics_1.30.0           nlme_3.1-161                 
##  [71] vipor_0.4.5                   rlang_1.0.6                  
##  [73] globals_0.16.2                lifecycle_1.0.3              
##  [75] miniUI_0.1.1.1                filelock_1.0.2               
##  [77] BiocFileCache_2.6.1           rsvd_1.0.5                   
##  [79] AnnotationHub_3.6.0           polyclip_1.10-4              
##  [81] lmtest_0.9-40                 boot_1.3-28.1                
##  [83] zoo_1.8-11                    beeswarm_0.4.0               
##  [85] ggridges_0.5.4                rjson_0.2.21                 
##  [87] png_0.1-8                     viridisLite_0.4.1            
##  [89] bitops_1.0-7                  KernSmooth_2.23-20           
##  [91] Biostrings_2.66.0             blob_1.2.3                   
##  [93] DelayedMatrixStats_1.20.0     parallelly_1.34.0            
##  [95] spatstat.random_3.1-3         beachmat_2.14.0              
##  [97] scales_1.2.1                  memoise_2.0.1                
##  [99] magrittr_2.0.3                plyr_1.8.8                   
## [101] ica_1.0-3                     gplots_3.1.3                 
## [103] zlibbioc_1.44.0               compiler_4.2.2               
## [105] BiocIO_1.8.0                  dqrng_0.3.0                  
## [107] RColorBrewer_1.1-3            fitdistrplus_1.1-8           
## [109] Rsamtools_2.14.0              cli_3.4.1                    
## [111] XVector_0.38.0                listenv_0.9.0                
## [113] patchwork_1.1.2               pbapply_1.7-0                
## [115] MASS_7.3-58.2                 tidyselect_1.2.0             
## [117] stringi_1.7.12                textshaping_0.3.6            
## [119] highr_0.10                    yaml_2.3.7                   
## [121] BiocSingular_1.14.0           locfit_1.5-9.7               
## [123] sass_0.4.5                    tools_4.2.2                  
## [125] timechange_0.2.0              future.apply_1.10.0          
## [127] parallel_4.2.2                rstudioapi_0.14              
## [129] bluster_1.8.0                 metapod_1.6.0                
## [131] farver_2.1.1                  rmdformats_1.0.4             
## [133] Rtsne_0.16                    digest_0.6.30                
## [135] BiocManager_1.30.20           FNN_1.1.3.1                  
## [137] shiny_1.7.4                   BiocVersion_3.16.0           
## [139] later_1.3.0                   RcppAnnoy_0.0.20             
## [141] httr_1.4.5                    AnnotationDbi_1.60.0         
## [143] colorspace_2.1-0              rvest_1.0.3                  
## [145] XML_3.99-0.13                 tensor_1.5                   
## [147] reticulate_1.27               splines_4.2.2                
## [149] uwot_0.1.14                   statmod_1.5.0                
## [151] spatstat.utils_3.0-1          sp_1.6-0                     
## [153] plotly_4.10.1                 systemfonts_1.0.4            
## [155] xtable_1.8-4                  nloptr_2.0.3                 
## [157] jsonlite_1.8.4                R6_2.5.1                     
## [159] pillar_1.8.1                  htmltools_0.5.4              
## [161] mime_0.12                     minqa_1.2.5                  
## [163] glue_1.6.2                    fastmap_1.1.0                
## [165] BiocParallel_1.32.5           BiocNeighbors_1.16.0         
## [167] interactiveDisplayBase_1.36.0 codetools_0.2-19             
## [169] utf8_1.2.2                    lattice_0.20-45              
## [171] bslib_0.4.2                   spatstat.sparse_3.0-0        
## [173] curl_5.0.0                    ggbeeswarm_0.7.1             
## [175] leiden_0.4.3                  gtools_3.9.4                 
## [177] survival_3.5-0                limma_3.54.2                 
## [179] rmarkdown_2.20                munsell_0.5.0                
## [181] GenomeInfoDbData_1.2.9        reshape2_1.4.4               
## [183] gtable_0.3.1